Module 15

Multiple Linear Regression- looks at the relationship between each of two or more continous predictor variabbles and a continuous repsonse variable while holding the effect of all other predictor variables constant

ANCOVA - looking at the relationship between one or more continuous predictor variables and a continuous response variable within each of one or more categorical groups

The ideal situation is that the predictor variables do not correlate with one another at all (i.e., zero covariance between each pair of predictors)

To experiment with multiple linear regression, let's construct a dataset of random numbers

# Target correlation matrix for the four variables (Y, X1, X2, X3).
# It must be symmetric with 1s on the diagonal; matrix() fills column-major,
# but a symmetric matrix reads the same either way.
R <- matrix(c(1, 0.8, -0.5, 0, 0.8, 1, -0.3, 0.3, -0.5, -0.3, 1, 0.6, 0,
    0.3, 0.6, 1), nrow = 4)
n <- 1000  # number of observations
k <- 4  # number of variables
mu <- c(15, 40, 5, 23)  # vector of variable means
s <- c(5, 20, 4, 15)  # vector of variable SDs
# Draw each column from its own normal distribution. vapply() preallocates the
# n x k result matrix instead of growing one with cbind() inside a loop, and
# makes the same rnorm() calls in the same order as a column-by-column loop.
M <- vapply(seq_len(k), function(i) rnorm(n, mu[i], s[i]), numeric(n))
orig <- as.data.frame(M)
names(orig) <- c("Y", "X1", "X2", "X3")
head(orig)
## (example output; exact values differ per run because no seed is set)
##          Y       X1        X2         X3
## 1 10.05232 24.23691 12.833636 -15.902132
## 2 13.60772 33.48114  7.002115  39.440165
## 3 12.27792 42.10686 14.825860  11.298098
## 4 17.93078 31.17851  3.524137  40.305653
## 5 21.22642 19.19276  5.560270   0.362382
## 6 16.87792 16.23798  6.220589  47.502624

The function cor() computes the pairwise correlation matrix. Remember, 1.0 (or -1.0) indicates perfect correlation and 0.0 indicates no correlation at all

# Because each column was drawn independently, the off-diagonal correlations
# should all be near zero.
cor(orig)  # variables are uncorrelated
##              Y          X1          X2          X3
## Y   1.00000000 -0.01910876  0.00889830  0.01091856
## X1 -0.01910876  1.00000000  0.03326950 -0.01627245
## X2  0.00889830  0.03326950  1.00000000 -0.03805011
## X3  0.01091856 -0.01627245 -0.03805011  1.00000000
plot(orig)  # does quick bivariate plots for each pair of variables; using `pairs(orig)` would do the same

#### Now, let’s normalize and standardize our variables by subtracting the relevant means and dividing by the standard deviation. This converts them to Z scores from a standard normal distribution.

# Standardize each column to Z scores: subtract the column mean and divide by
# the column SD. colMeans() avoids coercing the data frame through apply(),
# and vapply() over the columns returns the same named numeric vector that
# apply(orig, 2, sd) would.
ms <- colMeans(orig)  # named vector of per-column means
ms
##         Y        X1        X2        X3 
## 14.972722 40.047105  4.998087 23.333006
sds <- vapply(orig, sd, numeric(1))  # named vector of per-column SDs
sds
##         Y        X1        X2        X3 
##  4.802269 19.802031  4.072626 15.244840
normalized <- sweep(orig, 2, STATS = ms, FUN = "-")  # subtract each column's mean
normalized <- sweep(normalized, 2, STATS = sds, FUN = "/")  # divide by each column's SD
head(normalized)  # now a dataframe of Z scores
##            Y         X1         X2         X3
## 1 -1.0245995 -0.7984129  1.9239553 -2.5736667
## 2 -0.2842420 -0.3315806  0.4920727  1.0565647
## 3 -0.5611527  0.1040176  2.4131296 -0.7894414
## 4  0.6159698 -0.4478627 -0.3619163  1.1133371
## 5  1.3022373 -1.0531417  0.1380394 -1.5067802
## 6  0.3967283 -1.2023578  0.3001755  1.5854294
M <- as.matrix(normalized)  # redefine M as our matrix of normalized variables

With apply() we apply a function to the specified margin of an array or matrix, and with sweep() we then perform whatever function is specified on all of the elements in an array specified by the given margin.

Now use Cholesky Decomposition… weird name…

# Cholesky decomposition factors R into U'U with U upper-triangular.
# Multiplying the standardized (uncorrelated) variables by U imposes the
# target correlation structure on them.
U <- chol(R)
newM <- M %*% U
new <- as.data.frame(newM)
names(new) <- c("Y", "X1", "X2", "X3")
cor(new)  # note that this correlation matrix is what we are aiming for!
##               Y         X1         X2          X3
## Y   1.000000000  0.7958684 -0.4943766 0.002613573
## X1  0.795868402  1.0000000 -0.2756530 0.313944956
## X2 -0.494376644 -0.2756530  1.0000000 0.598618032
## X3  0.002613573  0.3139450  0.5986180 1.000000000
plot(orig)

plot(new)

Finally, we can scale these back out to the mean and distribution of our original random variables. This uses the sweep() function in reverse — the resulting values are no longer standardized

df <- sweep(new, 2, STATS = sds, FUN = "*")  # multiply by the SDs to restore the original spread...
df <- sweep(df, 2, STATS = ms, FUN = "+")  # ...then add the means back to restore the original center
head(df)
##          Y       X1        X2        X3
## 1 10.05232 14.32967 13.201489 10.880472
## 2 13.60772 31.60467  7.054921 35.300357
## 3 12.27792 32.39339 14.563374 39.068799
## 4 17.93078 44.48391  2.187167 27.032812
## 5 21.22642 48.16405  2.109246  2.417693
## 6 16.87792 32.04645  4.413026 31.856923
# Rescaling is a linear transformation per column, so the imposed
# correlation structure is unchanged.
cor(df)
##               Y         X1         X2          X3
## Y   1.000000000  0.7958684 -0.4943766 0.002613573
## X1  0.795868402  1.0000000 -0.2756530 0.313944956
## X2 -0.494376644 -0.2756530  1.0000000 0.598618032
## X3  0.002613573  0.3139450  0.5986180 1.000000000
plot(df)

CHALLENGE: Start off by making some bivariate scatterplots in {ggplot2}. Then, using simple linear regression as implemented with lm(), how does the response variable (Y) vary with each predictor variable (X1, X2, X3)? Are the β1 coefficients significant? How much of the variation in Y does each predictor explain in a simple bivariate linear model?

library(ggplot2)
library(gridExtra)  # for grid.arrange(); library() fails loudly if the package is missing, unlike require()
# Bivariate scatterplots of Y against each predictor, each with a fitted OLS line
g1 <- ggplot(data = df, aes(x = X1, y = Y)) + geom_point() + geom_smooth(method = "lm", formula = y ~ x)
g2 <- ggplot(data = df, aes(x = X2, y = Y)) + geom_point() + geom_smooth(method = "lm",  formula = y ~ x)
g3 <- ggplot(data = df, aes(x = X3, y = Y)) + geom_point() + geom_smooth(method = "lm",  formula = y ~ x)
grid.arrange(g1, g2, g3, ncol = 3)  # lay the three panels out side by side

# Simple bivariate regression: Y on X1 alone
m1 <- lm(data = df, formula = Y ~ X1)
summary(m1)
## 
## Call:
## lm(formula = Y ~ X1, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.917 -2.095  0.147  1.990  8.384 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.171377   0.209183   34.28   <2e-16 ***
## X1          0.194804   0.004691   41.52   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.909 on 998 degrees of freedom
## Multiple R-squared:  0.6334, Adjusted R-squared:  0.633 
## F-statistic:  1724 on 1 and 998 DF,  p-value: < 2.2e-16
# Simple bivariate regression: Y on X2 alone
m2 <- lm(data = df, formula = Y ~ X2)
summary(m2)
## 
## Call:
## lm(formula = Y ~ X2, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.7947  -2.8163  -0.0116   2.7366  13.4699 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 17.87902    0.20882   85.62   <2e-16 ***
## X2          -0.58148    0.03236  -17.97   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.176 on 998 degrees of freedom
## Multiple R-squared:  0.2444, Adjusted R-squared:  0.2437 
## F-statistic: 322.8 on 1 and 998 DF,  p-value: < 2.2e-16
# Simple bivariate regression: Y on X3 alone (note the slope here is NOT
# significant: p = 0.934)
m3 <- lm(data = df, formula = Y ~ X3)
summary(m3)
## 
## Call:
## lm(formula = Y ~ X3, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.2551  -3.3741  -0.0604   3.3342  16.0607 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 14.953334   0.279693  53.463   <2e-16 ***
## X3           0.000831   0.010064   0.083    0.934    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.805 on 998 degrees of freedom
## Multiple R-squared:  6.831e-06,  Adjusted R-squared:  -0.0009952 
## F-statistic: 0.006817 on 1 and 998 DF,  p-value: 0.9342

To review, with multiple regression, we are looking to model a response variable in terms of two or more predictor variables so we can evaluate the effect of several explanatory variables.

Using lm() and formula notation, we can fit a model with all three predictor variables. The + sign is used to add additional predictors to our model.

# Full multiple regression: Y modeled on all three predictors at once;
# '+' adds each predictor as a main effect
m <- lm(data = df, formula = Y ~ X1 + X2 + X3)
coef(m)  # fitted partial regression coefficients
## (Intercept)          X1          X2          X3 
##   9.5023647   0.1887012  -0.2539637  -0.0350254
summary(m)
## 
## Call:
## lm(formula = Y ~ X1 + X2 + X3, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.0153 -1.8111  0.0204  1.7561  8.8276 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.502365   0.242499  39.185  < 2e-16 ***
## X1           0.188701   0.005452  34.609  < 2e-16 ***
## X2          -0.253964   0.031055  -8.178 8.73e-16 ***
## X3          -0.035025   0.008499  -4.121 4.08e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.545 on 996 degrees of freedom
## Multiple R-squared:   0.72,  Adjusted R-squared:  0.7192 
## F-statistic: 853.8 on 3 and 996 DF,  p-value: < 2.2e-16
# let's check if our residuals are random normal...
plot(fitted(m), residuals(m))

#### You want your fitted model and residuals plot to show a random distribution. This means that all the other variation that your model is not accounting for is random (and not capable of being explained by some other non-accounted for variable).

hist(residuals(m))  # histogram of residuals should look roughly normal

qqnorm(residuals(m))  # points should fall near a straight line if residuals are normal

#### What does this output tell us? First off, the results of the omnibus F test tells us that the overall model is significant; using these three variables, we explain significantly more of the variation in the response variable, Y, than we would using a model with just an intercept, i.e., just that Y = mean(Y).

You can calculate the F-statistic by hand:

# F = [R^2/(1 - R^2)] * [df_residual/df_model], where df_residual = n - p - 1
# = nrow(df) - ncol(df) and df_model = p = ncol(df) - 1 predictors.
f <- with(summary(m), (r.squared/(1 - r.squared)) * (nrow(df) - ncol(df))/(ncol(df) - 1))
f
## [1] 853.8013

Second, looking at summary() we see that the β coefficient for each of our predictor variables (including X3) is significant. That is, each predictor is significant even when the effects of the other predictor variables are held constant. Recall that in the simple linear regression, the β coefficient for X3 was not significant.

Third, we can interpret our β coefficients as we did in simple linear regression… for each change of one unit in a particular predictor variable (holding the other predictors constant), our predicted value of the response variable changes β units.

CHALLENGE: Load up the “zombies.csv” dataset again and run a linear model of height as a function of both weight and age. Is the overall model significant? Are both predictor variables significant when the other is controlled for?

# {curl} lets us read the dataset straight from a remote URL
library(curl)
f <- curl("https://raw.githubusercontent.com/fuzzyatelin/fuzzyatelin.github.io/master/AN597_Fall17/zombies.csv")
z <- read.csv(f, header = TRUE, sep = ",", stringsAsFactors = TRUE)  # factors needed for categorical predictors like gender
head(z)
##   id first_name last_name gender   height   weight zombies_killed
## 1  1      Sarah    Little Female 62.88951 132.0872              2
## 2  2       Mark    Duncan   Male 67.80277 146.3753              5
## 3  3    Brandon     Perez   Male 72.12908 152.9370              1
## 4  4      Roger   Coleman   Male 66.78484 129.7418              5
## 5  5      Tammy    Powell Female 64.71832 132.4265              4
## 6  6    Anthony     Green   Male 71.24326 152.5246              1
##   years_of_education                           major      age
## 1                  1                medicine/nursing 17.64275
## 2                  3 criminal justice administration 22.58951
## 3                  1                       education 21.91276
## 4                  6                  energy studies 18.19058
## 5                  3                       logistics 21.10399
## 6                  4                  energy studies 21.48355
# Multiple regression: height as a function of both weight and age;
# both predictors are significant when the other is controlled for
m <- lm(data = z, height ~ weight + age)
summary(m)
## 
## Call:
## lm(formula = height ~ weight + age, data = z)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.2278 -1.1782 -0.0574  1.1566  5.4117 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 31.763388   0.470797   67.47   <2e-16 ***
## weight       0.163107   0.002976   54.80   <2e-16 ***
## age          0.618270   0.018471   33.47   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.64 on 997 degrees of freedom
## Multiple R-squared:  0.8555, Adjusted R-squared:  0.8553 
## F-statistic:  2952 on 2 and 997 DF,  p-value: < 2.2e-16

We can use the same linear modeling approach to do analysis of covariance, where we have a continuous response variable and a combination of continuous and categorical predictor variables. Let’s return to our “zombies.csv” dataset and now include one continuous and one categorical variable in our model… we want to predict height as a function of age and gender, and we want to use Type II regression. What is our model formula?

library(car)  # provides Anova() for Type II tests
# ANCOVA: one categorical (gender) and one continuous (age) predictor
m <- lm(data = z, formula = height ~ gender + age)
summary(m)
## 
## Call:
## lm(formula = height ~ gender + age, data = z)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.1909  -1.7173   0.1217   1.7670   7.6746 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 46.74251    0.56869   82.19   <2e-16 ***
## genderMale   4.00224    0.16461   24.31   <2e-16 ***
## age          0.94091    0.02777   33.88   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.603 on 997 degrees of freedom
## Multiple R-squared:  0.6361, Adjusted R-squared:  0.6354 
## F-statistic: 871.5 on 2 and 997 DF,  p-value: < 2.2e-16
m.aov <- Anova(m, type = "II")
m.aov # Type II sums of squares: each main effect is tested after the other main effects
## Anova Table (Type II tests)
## 
## Response: height
##           Sum Sq  Df F value    Pr(>F)    
## gender    4003.9   1  591.15 < 2.2e-16 ***
## age       7775.6   1 1148.01 < 2.2e-16 ***
## Residuals 6752.7 997                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(fitted(m), residuals(m))  # residuals vs fitted: look for structure

hist(residuals(m))  # residuals should look roughly normal

qqnorm(residuals(m))  # ...and fall near a straight line in the Q-Q plot

#### To visualize it…

library(ggplot2)
# Points colored by gender; with factor levels Female, Male the manual scale
# maps Female -> goldenrod and Male -> blue
p <- ggplot(data = z, aes(x = age, y = height)) + geom_point(aes(color = factor(gender))) + 
    scale_color_manual(values = c("goldenrod", "blue"))
# For height ~ gender + age the coefficients are (Intercept), genderMale, age:
# the female line uses the intercept [1] and the shared age slope [3]...
p <- p + geom_abline(slope = m$coefficients[3], intercept = m$coefficients[1], 
    color = "goldenrod4")
# ...the male line shifts the intercept up by the genderMale coefficient [2]
p <- p + geom_abline(slope = m$coefficients[3], intercept = m$coefficients[1] + 
    m$coefficients[2], color = "darkblue")
p

#### Using the confint() function on our ANCOVA model results reveals the confidence intervals for each of the coefficients in our multiple regression, just as it did for simple regression.

# Refit with the terms reordered (age + gender); it is the same model and the
# same fit — only the row ordering of the coefficient table changes
m <- lm(data = z, formula = height ~ age + gender)
summary(m)
## 
## Call:
## lm(formula = height ~ age + gender, data = z)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.1909  -1.7173   0.1217   1.7670   7.6746 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 46.74251    0.56869   82.19   <2e-16 ***
## age          0.94091    0.02777   33.88   <2e-16 ***
## genderMale   4.00224    0.16461   24.31   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.603 on 997 degrees of freedom
## Multiple R-squared:  0.6361, Adjusted R-squared:  0.6354 
## F-statistic: 871.5 on 2 and 997 DF,  p-value: < 2.2e-16
confint(m, level = 0.95)  # 95% confidence interval for each coefficient
##                  2.5 %     97.5 %
## (Intercept) 45.6265330 47.8584809
## age          0.8864191  0.9954081
## genderMale   3.6792172  4.3252593

Similarly, using predict() allows us to determine confidence intervals for the predicted mean response and prediction intervals for individual responses for a given combination of predictor variables.

So far, we have only considered the joint main effects of multiple predictors on a response variable, but often there are interactive effects between our predictors. An interactive effect is an additional change in the response that occurs because of particular combinations of predictors or because the relationship of one continuous variable to a response is contingent on a particular level of a categorical variable. We explored the former case a bit when we looked at ANOVAs involving two discrete predictors. Now, we’ll consider the latter case… is there an interactive effect of sex AND age on height in our population of zombie apocalypse survivors?

Using formula notation, it is easy for us to consider interactions between predictors. The colon (:) operator allows us to specify particular interactions we want to consider; we can use the asterisk (*) operator to specify a full model, i.e., all single terms factors and their interactions.

# The ':' operator adds only the specified interaction term...
m <- lm(data = z, height ~ age + gender + age:gender)  # or
summary(m)
## 
## Call:
## lm(formula = height ~ age + gender + age:gender, data = z)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.7985 -1.6973  0.1189  1.7662  7.9473 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    48.18107    0.79839  60.348   <2e-16 ***
## age             0.86913    0.03941  22.053   <2e-16 ***
## genderMale      1.15975    1.12247   1.033   0.3018    
## age:genderMale  0.14179    0.05539   2.560   0.0106 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.595 on 996 degrees of freedom
## Multiple R-squared:  0.6385, Adjusted R-squared:  0.6374 
## F-statistic: 586.4 on 3 and 996 DF,  p-value: < 2.2e-16
# ...while '*' expands to all main effects plus their interaction — the two
# formulas fit identical models, as the matching summaries show
m <- lm(data = z, height ~ age * gender)
summary(m)
## 
## Call:
## lm(formula = height ~ age * gender, data = z)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.7985 -1.6973  0.1189  1.7662  7.9473 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    48.18107    0.79839  60.348   <2e-16 ***
## age             0.86913    0.03941  22.053   <2e-16 ***
## genderMale      1.15975    1.12247   1.033   0.3018    
## age:genderMale  0.14179    0.05539   2.560   0.0106 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.595 on 996 degrees of freedom
## Multiple R-squared:  0.6385, Adjusted R-squared:  0.6374 
## F-statistic: 586.4 on 3 and 996 DF,  p-value: < 2.2e-16
coefficients(m)  # the age:genderMale term: males' age slope exceeds females' by ~0.14
##    (Intercept)            age     genderMale age:genderMale 
##     48.1810741      0.8691284      1.1597481      0.1417928

CHALLENGE: Load in the "KamilarAndCooperData.csv" dataset we have used previously. Reduce the dataset to the following variables: Family, Brain_Size_Female_Mean, Body_mass_female_mean, MeanGroupSize, DayLength_km, HomeRange_km2, and Move. Fit a Model I least squares multiple linear regression model using log(HomeRange_km2) as the response variable and log(Body_mass_female_mean), log(Brain_Size_Female_Mean), MeanGroupSize, and Move as predictor variables, and view a model summary.

# {dplyr} gives us select(); note the masking messages — it shadows
# car::recode, gridExtra::combine, and several base/stats functions
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
f <- curl("https://raw.githubusercontent.com/fuzzyatelin/fuzzyatelin.github.io/master/AN597_Fall17/KamilarAndCooperData.csv")
d <- read.csv(f, header = TRUE, sep = ",", stringsAsFactors = TRUE)
head(d)
##               Scientific_Name          Family          Genus      Species
## 1 Allenopithecus_nigroviridis Cercopithecidae Allenopithecus nigroviridis
## 2         Allocebus_trichotis Cercopithecidae      Allocebus    trichotis
## 3           Alouatta_belzebul        Atelidae       Alouatta     belzebul
## 4             Alouatta_caraya        Atelidae       Alouatta       caraya
## 5            Alouatta_guariba        Atelidae       Alouatta      guariba
## 6           Alouatta_palliata        Atelidae       Alouatta     palliata
##   Brain_Size_Species_Mean Brain_Size_Female_Mean   Brain_size_Ref
## 1                   58.02                  53.70 Isler et al 2008
## 2                      NA                     NA                 
## 3                   52.84                  51.19 Isler et al 2008
## 4                   52.63                  47.80 Isler et al 2008
## 5                   51.70                  49.08 Isler et al 2008
## 6                   49.88                  48.04 Isler et al 2008
##   Body_mass_male_mean Body_mass_female_mean Mass_Dimorphism
## 1                6130                  3180           1.928
## 2                  92                    84           1.095
## 3                7270                  5520           1.317
## 4                6525                  4240           1.539
## 5                5800                  4550           1.275
## 6                7150                  5350           1.336
##                 Mass_Ref MeanGroupSize AdultMales AdultFemale
## 1       Isler et al 2008            NA         NA          NA
## 2 Smith and Jungers 1997          1.00       1.00         1.0
## 3       Isler et al 2008          7.00       1.00         1.0
## 4       Isler et al 2008          8.00       2.30         3.3
## 5       Isler et al 2008          6.53       1.37         2.2
## 6       Isler et al 2008         12.00       2.90         6.3
##   AdultSexRatio
## 1            NA
## 2            NA
## 3          1.00
## 4          1.43
## 5          1.61
## 6          2.17
##                                                     Social_Organization_Ref
## 1                                                                          
## 2                                                             Kappeler 1997
## 3                                                       Campbell et al 2007
## 4 van Schaik et al. 1999; Kappeler and Pereira 2003; Nunn & van Schaik 2000
## 5                                                       Campbell et al 2007
## 6 van Schaik et al. 1999; Kappeler and Pereira 2003; Nunn & van Schaik 2000
##   InterbirthInterval_d Gestation WeaningAge_d MaxLongevity_m LitterSz
## 1                   NA        NA       106.15          276.0     1.01
## 2                   NA        NA           NA             NA     1.00
## 3                   NA        NA           NA             NA       NA
## 4               337.62       187       323.16          243.6     1.01
## 5                   NA        NA           NA             NA       NA
## 6               684.37       186       495.60          300.0     1.02
##    Life_History_Ref GR_MidRangeLat_dd Precip_Mean_mm Temp_Mean_degC
## 1 Jones et al. 2009             -0.17         1574.0           25.2
## 2                              -16.59         1902.3           20.3
## 3                               -6.80         1643.5           24.9
## 4 Jones et al. 2009            -20.34         1166.4           22.9
## 5                              -21.13         1332.3           19.6
## 6 Jones et al. 2009              6.95         1852.6           23.7
##   AET_Mean_mm PET_Mean_mm       Climate_Ref HomeRange_km2
## 1      1517.8      1589.4 Jones et al. 2009            NA
## 2      1388.2      1653.7 Jones et al. 2009            NA
## 3      1286.6      1549.8 Jones et al. 2009            NA
## 4      1193.1      1404.9 Jones et al. 2009            NA
## 5      1225.7      1332.2 Jones et al. 2009          0.03
## 6      1300.0      1633.9 Jones et al. 2009          0.19
##        HomeRangeRef DayLength_km     DayLengthRef Territoriality Fruit
## 1                             NA                              NA    NA
## 2                             NA                              NA    NA
## 3                             NA                              NA  57.3
## 4                           0.40 Nunn et al. 2003             NA  23.8
## 5 Jones et al. 2009           NA                              NA   5.2
## 6 Jones et al. 2009         0.32 Nunn et al. 2003         0.6506  33.1
##   Leaves Fauna             DietRef1 Canine_Dimorphism
## 1                                               2.210
## 2                                                  NA
## 3   19.1   0.0 Campbell et al. 2007             1.811
## 4   67.7   0.0 Campbell et al. 2007             1.542
## 5   73.0   0.0 Campbell et al. 2007             1.783
## 6   56.4   0.0 Campbell et al. 2007             1.703
##   Canine_Dimorphism_Ref  Feed  Move  Rest Social  Activity_Budget_Ref
## 1   Plavcan & Ruff 2008    NA    NA    NA     NA                     
## 2                          NA    NA    NA     NA                     
## 3   Plavcan & Ruff 2008 13.75 18.75 57.30  10.00 Campbell et al. 2007
## 4   Plavcan & Ruff 2008 15.90 17.60 61.60   4.90 Campbell et al. 2007
## 5   Plavcan & Ruff 2008 18.33 14.33 64.37   3.00 Campbell et al. 2007
## 6   Plavcan & Ruff 2008 17.94 12.32 66.14   3.64 Campbell et al. 2007
# Reduce the dataset to the challenge variables
d <- select(d, Brain_Size_Female_Mean, Family, Body_mass_female_mean, MeanGroupSize, 
    DayLength_km, HomeRange_km2, Move)
# Model I least-squares multiple regression with log-transformed size variables
m <- lm(data = d, log(HomeRange_km2) ~ log(Body_mass_female_mean) + log(Brain_Size_Female_Mean) + 
    MeanGroupSize + Move)
summary(m)
## 
## Call:
## lm(formula = log(HomeRange_km2) ~ log(Body_mass_female_mean) + 
##     log(Brain_Size_Female_Mean) + MeanGroupSize + Move, data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7978 -0.6473 -0.0038  0.8807  2.1598 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -6.955853   1.957472  -3.553 0.000865 ***
## log(Body_mass_female_mean)   0.315276   0.468439   0.673 0.504153    
## log(Brain_Size_Female_Mean)  0.614460   0.591100   1.040 0.303771    
## MeanGroupSize                0.034026   0.009793   3.475 0.001095 ** 
## Move                         0.025916   0.019559   1.325 0.191441    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.127 on 48 degrees of freedom
##   (160 observations deleted due to missingness)
## Multiple R-squared:  0.6359, Adjusted R-squared:  0.6055 
## F-statistic: 20.95 on 4 and 48 DF,  p-value: 4.806e-10
plot(m$residuals)  # residuals in observation order: look for trends

qqnorm(m$residuals)  # Q-Q plot to eyeball normality of residuals

#### Test for normality

# Shapiro-Wilk test: H0 is that the residuals are normally distributed
shapiro.test(m$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  m$residuals
## W = 0.96517, p-value = 0.1242
# Refit without Move; far fewer rows are lost to missing data
# (106 dropped here vs 160 in the model above)
m <- lm(data = d, log(HomeRange_km2) ~ log(Body_mass_female_mean) + log(Brain_Size_Female_Mean) + 
    MeanGroupSize)
summary(m)
## 
## Call:
## lm(formula = log(HomeRange_km2) ~ log(Body_mass_female_mean) + 
##     log(Brain_Size_Female_Mean) + MeanGroupSize, data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7982 -0.7310 -0.0140  0.8386  3.1926 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 -5.354845   1.176046  -4.553 1.45e-05 ***
## log(Body_mass_female_mean)  -0.181627   0.311382  -0.583 0.560972    
## log(Brain_Size_Female_Mean)  1.390536   0.398840   3.486 0.000721 ***
## MeanGroupSize                0.030433   0.008427   3.611 0.000473 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.228 on 103 degrees of freedom
##   (106 observations deleted due to missingness)
## Multiple R-squared:  0.6766, Adjusted R-squared:  0.6672 
## F-statistic: 71.85 on 3 and 103 DF,  p-value: < 2.2e-16
plot(m$residuals)

qqnorm(m$residuals)

shapiro.test(m$residuals)  # no significant deviation from normal
## 
##  Shapiro-Wilk normality test
## 
## data:  m$residuals
## W = 0.99366, p-value = 0.9058